library(tidyverse)
Read raw data.
# Load the tab-separated sample attribute table; column types are left to
# readr's type guessing.
mmetsp_raw_data <- read_tsv(file = "../data/sample-attr.tab.txt")
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────
cols(
sample_id = col_double(),
sample_name = col_character(),
attr_type = col_character(),
attr_value = col_character()
)
# Print the raw table for a quick visual check.
mmetsp_raw_data
Unpack attributes.
# Spread the attribute rows into one column per `attr_type`, keyed by sample.
# `sample_name` is renamed first so it does not clash with the pivoted
# attribute column of the same name.
# Some (sample, attribute) pairs occur more than once, so the pivoted columns
# are collected as list-columns. `values_fn = list` requests this explicitly
# instead of relying on the fallback that emits a warning.
mmetsp_wider <- mmetsp_raw_data %>%
  rename(sample_name_main = sample_name) %>%
  pivot_wider(
    id_cols = c("sample_id", "sample_name_main"),
    names_from = "attr_type",
    values_from = "attr_value",
    values_fn = list,
    names_repair = "unique"
  )
Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates
mmetsp_wider

# Collapse the `sample_name` list-column to a single value per sample by
# keeping only its first entry.
mmetsp_fixed <- mmetsp_wider %>%
  # rowwise() makes `sample_name` refer to the current row's list element
  rowwise() %>%
  mutate(sample_name = sample_name[[1]]) %>%
  # drop the row-wise grouping so later verbs work on whole columns again
  ungroup()
mmetsp_fixed
NA
# For every column, report the largest number of values held in any single
# cell, transposed so that each column name becomes a row. Counts above 1 mark
# attributes that were recorded more than once for at least one sample.
t(
  summarise(
    mmetsp_wider,
    across(everything(), function(col) max(lengths(col)))
  )
)
[,1]
sample_id 1
sample_name_main 1
GenBank BioSample 1
NCBI SRA 1
project_id 1
source_mat_id 4
sample_name 2
latitude 2
longitude 2
habitat_name 1
taxon_id 1
strain 1
genus 1
species 1
family 1
depth 1
sample_collection_site 1
class 1
phylum 1
assembly_accession_number 1
collection_date 1
date_of_experiment 1
day_portion_of_day_night_cycle_in_hours 1
envo_term_for_habitat_primary_term 1
growth_medium 1
investigation_type 1
light 1
other_collection_site_info 1
sample_material 1
experimental_salinity 1
experimental_temperature 1
fastq_file 1
night_portion_of_day_night_cycle_in_hours 1
primary_citation 2
longhurst_province 1
ph 1
clonal 1
envo_term_for_habitat_secondary_term 1
habitat_description 1
prey_organism_if_applicable 1
environmental_salinity 1
other_experimental_metadata_available 2
country 2
axenic 1
modifications_to_growth_medium 1
additional_citations 2
collection_time 1
other_environmental_metadata_available 2
environmental_temperature 1
phosphate 1
nitrate 1
iron 1
trace_elements 1
pressure 1
carbon_dioxide 1
ammonium 1
pcr_amp 1
silicate 1
sample_type 1
sample_description 1
collection_start_time 1
collection_stop_time 1
site_name 1
site_description 1
library_acc 1
sequencing_method 1
dna_type 1
comments 1
order 1
superkingdom 1
combined_assembly_name 1
external_sample_id 1
habitat 1
principle_investigator 1
sample_volume 1
volume_unit 1
filter_min 1
filter_max 1
filter_fraction_maximum 1
filter_fraction_minimum 1
volume_filtered 1
urea 1
chlorophyll 1
elevation 1
dissolved_oxygen 1
particulate_organic_carbon 1
region 1
Select and unnest taxon info.
# Keep the identifier, taxonomy, read-file and coordinate columns, then expand
# their list-columns into regular columns.
mmetsp_taxon <- mmetsp_wider %>%
  select(
    sample_id, sample_name_main, taxon_id, phylum, class, order, genus,
    species, strain, fastq_file, latitude, longitude
  ) %>%
  # `cols` is given explicitly; calling unnest() without it is deprecated and
  # only works through a warning-emitting fallback.
  unnest(
    cols = c(
      taxon_id, phylum, class, order, genus, species, strain, fastq_file,
      latitude, longitude
    )
  ) %>%
  mutate(
    # join genus, species and strain with "_" and replace any spaces with "_"
    genus_species_strain = gsub(
      " ", "_", paste(genus, species, strain, sep = "_")
    )
  )
`cols` is now required when using unnest().
Please use `cols = c(taxon_id, phylum, class, order, genus, species, strain, fastq_file,
latitude, longitude)`
mmetsp_taxon

# Show only the rows that have a non-missing latitude.
drop_na(mmetsp_taxon, latitude)
Select only the bare-bones columns.
# Reduce the taxon table to the identifiers, the combined
# genus/species/strain label and the read file.
mmetsp_select <- select(
  mmetsp_taxon,
  sample_id, sample_name_main, taxon_id, genus_species_strain, fastq_file
)
# Column names supplied for the headerless read of the GenBank assembly
# summary.
# NOTE(review): "infraspecific_name2" and "isolateversion_status" look like
# stand-ins for NCBI's "isolate" and "version_status" columns. They are left
# unchanged so existing references keep working -- confirm and rename.
colNames <- "assembly_accession, bioproject, biosample, wgs_master, refseq_category, taxid, species_taxid, organism_name, infraspecific_name, infraspecific_name2, isolateversion_status, assembly_level, release_type, genome_rep, seq_rel_date, asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path, excluded_from_refseq, relation_to_type_material"
colNamesVec <- unlist(str_split(colNames, ", "))

# Count the "#" header lines at the top of the file so they can be skipped by
# position. `comment = "#"` is deliberately not used: readr ignores everything
# after a "#" anywhere in a line, so data fields containing "#" had their rows
# cut short (the "22 columns expected / 10 columns actual" parsing failures).
genbank_path <- "../data/assembly_summary_genbank.txt"
genbank_head <- read_lines(genbank_path, n_max = 20)
# cumprod() turns to 0 at the first non-"#" line, so the sum is the length of
# the leading run of "#" lines.
genbank_skip <- sum(cumprod(startsWith(genbank_head, "#")))

# Read the assembly summary with explicit column types.
# Taxonomy IDs are read as text directly: parsing them as doubles and then
# calling as.character() renders round values such as 100000 as "1e+05",
# which can no longer match a character taxon ID.
genbank <- read_tsv(
  genbank_path,
  skip = genbank_skip,
  col_names = colNamesVec,
  col_types = cols(
    .default = col_character(),
    seq_rel_date = col_date(format = "")
  )
)
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
taxid = col_double(),
species_taxid = col_double(),
seq_rel_date = col_date(format = "")
)
ℹ Use `spec()` for the full column specifications.
46385 parsing failures.
row col expected actual file
3 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
12 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
26 -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
95 -- 22 columns 9 columns '../data/assembly_summary_genbank.txt'
96 -- 22 columns 9 columns '../data/assembly_summary_genbank.txt'
... ... .......... .......... ......................................
See problems(...) for more details.
# Narrow the assembly table to the taxonomy IDs, organism name, genome
# representation and FTP path.
genbank_select <- select(
  genbank,
  taxid, species_taxid, organism_name, genome_rep, ftp_path
)
genbank_select
mmetsp_taxon

# Make sure both taxonomy ID columns are stored as text.
genbank_select <- mutate(
  genbank_select,
  across(c(taxid, species_taxid), as.character)
)
genbank_select
genbank